# check if 'librarian' is installed and if not, install itif (!"librarian"%in%rownames(installed.packages()) ){install.packages("librarian")}# load packages if not already loadedlibrarian::shelf( tidyverse, magrittr, gt, gtExtras, tidymodels, DataExplorer, skimr, janitor, ggplot2)
Warning: package 'gt' was built under R version 4.3.3
Warning: package 'modeldata' was built under R version 4.3.3
# library(magrittr) # the pipe# library(tidyverse) # for data wrangling + visualization# library(tidymodels) # for modeling# library(gt) # for pretty tablestheme_set(theme_bw(base_size =12))boston_cocktails <- readr::read_csv('../data/boston_cocktails.csv', show_col_types =FALSE)
Data: The Boston Cocktail Recipes
The Boston Cocktail Recipes dataset appeared in a TidyTuesday posting. TidyTuesday is a weekly data project in R.
The dataset is derived from the Mr. Boston Bartender’s Guide, together with a dataset that was web-scraped as part of a hackathon.
This dataset contains the following information for each cocktail:
variable
class
description
name
character
Name of cocktail
category
character
Category of cocktail
row_id
integer
Drink identifier
ingredient_number
integer
Ingredient number
ingredient
character
Ingredient
measure
character
Measurement/volume of ingredient
measure_number
real
measure as a number
Exercises
Exercise 1
First use skimr::skim and DataExplorer::introduce to assess the quality of the data set.
Next prepare a summary. What is the median measure number across cocktail recipes?
name category row_id ingredient_number
Length:2542 Length:2542 Min. : 1.0 Min. :1.000
Class :character Class :character 1st Qu.:261.2 1st Qu.:1.000
Mode :character Mode :character Median :497.0 Median :2.000
Mean :495.4 Mean :2.545
3rd Qu.:730.8 3rd Qu.:3.000
Max. :990.0 Max. :6.000
ingredient measure measure_number
Length:2542 Length:2542 Min. : 0.0200
Class :character Class :character 1st Qu.: 0.5000
Mode :character Mode :character Median : 1.0000
Mean : 0.9797
3rd Qu.: 1.5000
Max. :16.0000
The median measure is 1.0. Note that the dimensions are identified as ounces (oz) in the measure column.
Exercise 2
From the boston_cocktails dataset select the name, category, ingredient, and measure_number columns and then pivot the table to create a column for each ingredient. Fill any missing values with the number zero.
Since the names of the new columns may contain spaces, clean them using the janitor::clean_names(). Finally drop any rows with NA values and save this new dataset in a variable.
How much gin is in the cocktail called Leap Frog Highball?
SOLUTION:
cocktails_df <- boston_cocktails %>%# select the columns (by de-selecting the ones we don't want) dplyr::select(-ingredient_number, -row_id, -measure) %>%# pivot wider (make more columns); use zeros in place of NA values tidyr::pivot_wider(names_from = ingredient , values_from = measure_number , values_fill =0 ) %>% janitor::clean_names() %>% tidyr::drop_na()# show the table in the documentcocktails_df
cocktails_df %>%# filter for the desired cocktail dplyr::filter(name =='Leap Frog Highball') %>% dplyr::pull(gin)
[1] 2
Two ounces (oz) of gin are in the Leap Frog Highball.
Exercise 3
Prepare a recipes::recipe object without a target but give name and category as ‘id’ roles. Add steps to normalize the predictors and perform PCA. Finally prep the data and save it in a variable.
How many predictor variables are prepped by the recipe?
SOLUTION:
# create a recipe: y~. with an outcome/target, but here we just use ~.pca_rec <- recipes::recipe(~., data = cocktails_df) pca_rec %>%summary()
# A tibble: 42 × 4
variable type role source
<chr> <list> <chr> <chr>
1 name <chr [3]> predictor original
2 category <chr [3]> predictor original
3 light_rum <chr [2]> predictor original
4 lemon_juice <chr [2]> predictor original
5 lime_juice <chr [2]> predictor original
6 sweet_vermouth <chr [2]> predictor original
7 orange_juice <chr [2]> predictor original
8 powdered_sugar <chr [2]> predictor original
9 dark_rum <chr [2]> predictor original
10 cranberry_juice <chr [2]> predictor original
# ℹ 32 more rows
pca_rec <- pca_rec %>%# change the roles of name and category to 'id' from 'predictor' recipes::update_role(name, category, new_role ="id") %>%# normalize the remaining predictors recipes::step_normalize(all_predictors()) %>%# convert the predictors to principle components recipes::step_pca(all_predictors())# note there are 40 predictors, but that nothing has been calculated yetpca_rec %>%summary()
# A tibble: 42 × 4
variable type role source
<chr> <list> <chr> <chr>
1 name <chr [3]> id original
2 category <chr [3]> id original
3 light_rum <chr [2]> predictor original
4 lemon_juice <chr [2]> predictor original
5 lime_juice <chr [2]> predictor original
6 sweet_vermouth <chr [2]> predictor original
7 orange_juice <chr [2]> predictor original
8 powdered_sugar <chr [2]> predictor original
9 dark_rum <chr [2]> predictor original
10 cranberry_juice <chr [2]> predictor original
# ℹ 32 more rows
# calculate prepare the data per the steps in the recipepca_prep <- recipes::prep(pca_rec)pca_prep %>% summary
# A tibble: 7 × 4
variable type role source
<chr> <list> <chr> <chr>
1 name <chr [3]> id original
2 category <chr [3]> id original
3 PC1 <chr [2]> predictor derived
4 PC2 <chr [2]> predictor derived
5 PC3 <chr [2]> predictor derived
6 PC4 <chr [2]> predictor derived
7 PC5 <chr [2]> predictor derived
There are 40 predictors and 2 id variables before the data is prepped.
Once prepped, the PCA returns just 5 components by default, so we have (post-prep) 5 predictors and 2 id variables.
Exercise 4
Apply the recipes::tidy verb to the prepped recipe in the last exercise. The result is a table identifying the information generated and stored by each step in the recipe from the input data.
To see the values calculated for normalization, apply the recipes::tidy verb as before, but with second argument = 1.
What ingredient is the most used, on average?
SOLUTION:
# tidy returns a tibble with the calculations performed by preppca_prep %>% recipes::tidy()
# A tibble: 2 × 6
number operation type trained skip id
<int> <chr> <chr> <lgl> <lgl> <chr>
1 1 step normalize TRUE FALSE normalize_zkFR2
2 2 step pca TRUE FALSE pca_wpByh
# if we select the first (normalization) step we get the values calculated:# - the mean and standard deviation for each variablefoo <- pca_prep %>% recipes::tidy(1)foo
# A tibble: 80 × 4
terms statistic value id
<chr> <chr> <dbl> <chr>
1 light_rum mean 0.161 normalize_zkFR2
2 lemon_juice mean 0.229 normalize_zkFR2
3 lime_juice mean 0.138 normalize_zkFR2
4 sweet_vermouth mean 0.0691 normalize_zkFR2
5 orange_juice mean 0.185 normalize_zkFR2
6 powdered_sugar mean 0.0891 normalize_zkFR2
7 dark_rum mean 0.0454 normalize_zkFR2
8 cranberry_juice mean 0.0363 normalize_zkFR2
9 pineapple_juice mean 0.0608 normalize_zkFR2
10 bourbon_whiskey mean 0.0768 normalize_zkFR2
# ℹ 70 more rows
# we can just filter to find the largest mean value# first, isolate the mean valuesfoo %>% dplyr::filter(statistic =='mean') %>%# once we have just the mean values, filter out the row with the max value dplyr::filter(value ==max(value))
# A tibble: 1 × 4
terms statistic value id
<chr> <chr> <dbl> <chr>
1 gin mean 0.252 normalize_zkFR2
On average, it is gin that is the largest component of the cocktails, with just over 1/4 oz per cocktail.
Exercise 5
Now look at the result of the PCA, applying the recipes::tidy verb as before, but with second argument = 2. Save the result in a variable and filter for the components PC1 to PC5. Mutate the resulting component column so that the values are factors, ordering them in the order they appear using the forcats::fct_inorder verb.
How would you describe the drinks represented by PC1?
SOLUTION:
# the tidy operation shows the weights of each ingredient, # for each principal component.# - i.e. PC1 (along with PC2 - PC5) is a weighted sum of ingredientsbar <- pca_prep %>% recipes::tidy(2)bar
# plot to show the ingredient weightsbar %>%# since there are only 5 components, this is redundant dplyr::filter(component %in%paste0("PC", 1:5)) %>%# change component from a character to a factor, and give them an order dplyr::mutate(component = forcats::fct_inorder(component)) %>%# plotggplot(aes(value, terms, fill = terms)) +geom_col(show.legend =FALSE) +facet_wrap(~component, nrow =1) +labs(y =NULL) +theme(axis.text=element_text(size=7),axis.title=element_text(size=14,face="bold"))
Exercise 6
As in the last exercise, use the variable with the tidied PCA data and use only PCA components PC1 to PC4. Take/slice the top 8 ingedients by component, ordered by their absolute value using the verb dplyr::slice_max. Next, generate a grouped table using gt::gt, colouring the cell backgrounds (i.e. fill) with green for values and red for values .
What is the characteristic alcoholic beverage of each of the first 4 principle components.
SOLUTION:
bar %>%# filter our the rows for PC1 - PC4 dplyr::filter(component %in%paste0("PC", 1:4)) %>%# group by component (i.e. principal component) dplyr::group_by(component) %>%# for each group, take the top 8 ingredients by absolute value dplyr::slice_max(n =8, order_by =abs(value)) %>%# now make a nicely formatted table gt::gt() %>%# make/apply a table style: this one for values < 0 gt::tab_style(style =list( gt::cell_fill(color ="red"), gt::cell_text(weight ="bold") ),locations = gt::cells_body(columns = value,rows = value <0 ) ) %>%# make/apply another table style: this one for values >= 0 gt::tab_style(style =list( gt::cell_fill(color ="green"), gt::cell_text(weight ="bold") ),locations = gt::cells_body(columns = value,rows = value >=0 ) ) %>%# apply a theme; any theme will do gtExtras::gt_theme_espn()
terms
value
id
PC1
powdered_sugar
-0.4764442
pca_wpByh
simple_syrup
0.3125978
pca_wpByh
whole_egg
-0.2903794
pca_wpByh
egg_white
-0.2643614
pca_wpByh
gin
-0.2588481
pca_wpByh
port
-0.2354010
pca_wpByh
lime_juice
0.2240295
pca_wpByh
blanco_tequila
0.2025436
pca_wpByh
PC2
dry_vermouth
0.4332412
pca_wpByh
sweet_vermouth
0.3754366
pca_wpByh
powdered_sugar
-0.3273723
pca_wpByh
lemon_juice
-0.2651836
pca_wpByh
simple_syrup
-0.2512150
pca_wpByh
gin
0.2437374
pca_wpByh
whole_egg
-0.2191547
pca_wpByh
port
-0.1763370
pca_wpByh
PC3
gin
0.3780348
pca_wpByh
egg_white
0.3094631
pca_wpByh
benedictine
-0.2864463
pca_wpByh
whole_egg
-0.2825812
pca_wpByh
blended_scotch_whiskey
-0.2352922
pca_wpByh
lemon_juice
0.2318927
pca_wpByh
vodka
-0.2167814
pca_wpByh
apricot_flavored_brandy
0.2134899
pca_wpByh
PC4
grenadine
0.4068737
pca_wpByh
orange_juice
0.3625622
pca_wpByh
vodka
0.3008406
pca_wpByh
simple_syrup
-0.2533007
pca_wpByh
half_and_half
0.2493552
pca_wpByh
egg_white
0.2465323
pca_wpByh
cranberry_juice
0.2228099
pca_wpByh
white_creme_de_cacao
0.2089063
pca_wpByh
Principal components and similar methods are very useful in reducing the complexity of our models. In this case we reduced the 40 original predictors to just 5 predictors.
The challenge with using these methods is attaching meaning to the revised predictors, and this is important when we need to explain our models. The computer can compute the new predictors but they can’t tell us what they represent. For this we need to look at the structure of the new predictors and see if we can attach some meaning to them; often the solution is to give them names that capture the underlying structure.
In this case, looking at the ingredients that make up the PCA predictors
PC1 represents a drink with:
little or no sugar, egg, gin or port; some or a lot of syrup and citrus juice
most often / mainly tequila
PC2 represents a drink with:
little or no sugar, syrup, or citrus juice
most often / mainly vermouth
PC3 represents a drink with:
little or no egg, whiskey or vodka
most often / mainly gin
PC4 represents a drink with
little or no syrup; some or a lot of juice and dairy product
most often / mainly grenadine and vodka
Exercise 7
For this exercise, bake the prepped PCA recipe using recipes::bake on the original data and plot each cocktail by its PC1, PC2 component, using
Can you create an interpretation of the PCA analysis?
SOLUTION:
# bake the dataset cocktails_df, creating a new datasetrecipes::bake(pca_prep, new_data = cocktails_df) %>%ggplot(aes(PC1, PC2, label = name)) +geom_point(aes(color = category), alpha =0.7, size =2) +geom_text(check_overlap =TRUE, hjust ="inward") +labs(color =NULL)
In this exercise we are plotting the cocktails against the first two principal components and trying to interpret the results.
The interpretation is more difficult now than it was in the last exercise where we found an interpretation for each principal component. Now we are looking at each cocktail in terms of combinations of PC1 and PC2, both positive and negative.
It appears that the lower left quadrant (negative PC1 and PC2 values) are mainly cocktail classics. If you look at the components of PC1 and PC2 from the last exercise and negate them, you can describe the cocktails in this quadrant: they have egg /egg-white, port, sugar, no vermouth or gin or tequila.
The lower right quadrant is positive PC1 and negative PC2. PC1 contains juice and syrup, while PC2 is negative juice and syrup - so in this quadrant we have cocktails that are like PC1 (juice and syrup) and unlike PC2 (juice and syrup & sugar). The cocktails in this quadrant have citrus juice, are sweet from the use of syrup & sugar and likely have tequila.
The top half of the plot shows cocktails clustered along the PC1=0 axis, so those are mainly tequila cocktails, very much like PC2.
Is there an opportunity to create a new cocktail in the upper left or upper right quadrants?